/**
 * 2017年5月17日
 */
package cn.edu.bjtu.workbench.datasource.fileiter;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Random;

import org.ansj.domain.Result;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.datavec.api.records.reader.impl.LineRecordReader;
import org.datavec.api.split.FileSplit;
import org.datavec.api.writable.Writable;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.SentencePreProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads sentences from a line-oriented, tab-separated source.
 *
 * <p>Each record produced by {@link LineRecordReader} is treated as one line of
 * text. The line is split on tab characters, the column at index {@link #seq}
 * is taken as the content, and that content is segmented with {@link ToAnalysis}
 * into a single space-separated string.
 *
 * <p>Lines that are malformed (too few columns), too short, or that do not look
 * like Chinese/English text yield {@code null} from {@link #nextSentence()};
 * callers must be prepared to skip {@code null} sentences.
 *
 * @author Alex
 */
public class WechatLineSentenceRecordReader extends LineRecordReader implements SentenceIterator{
	private static final long serialVersionUID = -4599612791106295846L;

	/** Column index used when the caller does not specify one. */
	private static final int DEFAULT_CONTENT_SEQ = 8;

	/** Content shorter than this many characters is rejected by {@link #canHandle(String)}. */
	private static final int MIN_CONTENT_LENGTH = 30;

	/** Maximum tolerated fraction of sampled characters that are neither Chinese nor English. */
	private static final double MAX_UNKNOWN_RATIO = 0.5;

	protected Logger log = LoggerFactory.getLogger(this.getClass());
	protected ToAnalysis ta = new ToAnalysis();

	/** Zero-based index of the tab-separated column that holds the sentence content. */
	int seq = DEFAULT_CONTENT_SEQ;

	/** Shared random source for character sampling; avoids allocating one per line. */
	private final Random random = new Random();

	/**
	 * Creates a reader that takes the content from the default column (index 8).
	 */
	public WechatLineSentenceRecordReader() {
		this(DEFAULT_CONTENT_SEQ);
	}

	/**
	 * Creates a reader that takes the content from the given column.
	 *
	 * @param contentSeq zero-based index of the tab-separated column holding the content
	 */
	public WechatLineSentenceRecordReader(int contentSeq){
		// Previously this always assigned 8 and silently ignored the argument.
		seq = contentSeq;
	}

	/**
	 * Reads the next line and returns its content column as segmented text.
	 *
	 * @return the segmented content joined by single spaces, or {@code null} when
	 *         the record is empty, the line has no column at index {@link #seq},
	 *         the content is rejected by {@link #canHandle(String)}, or
	 *         segmentation fails
	 */
	@Override
	public String nextSentence() {
		List<Writable> record = this.next();
		if (record == null || record.isEmpty()) {
			return null;
		}
		String line = null;
		try {
			line = record.get(0).toString();
			String[] columns = line.split("\t");
			// Explicit bounds check instead of relying on ArrayIndexOutOfBoundsException.
			if (seq < 0 || seq >= columns.length) {
				log.info("error foramt at {},of split {} ,content {}",this.lineIndex,this.splitIndex,line);
				return null;
			}
			String content = columns[seq];
			if (!canHandle(content)) {
				// Log the rejected line; the original logged accepted lines instead.
				log.info("unknow language at {},of split {} ,content {}",this.lineIndex,this.splitIndex,line);
				return null;
			}
			Result parsed = ta.parseStr(content);
			return parsed.toStringWithOutNature(" ");
		} catch (Exception e) {
			// Best effort: a single bad line must not abort the whole iteration.
			log.info("error foramt at {},of split {} ,content {}",this.lineIndex,this.splitIndex,line);
			return null;
		}
	}

	/**
	 * Closes the underlying reader. An {@link IOException} raised while closing
	 * is logged and not propagated.
	 */
	@Override
	public void finish() {
		try {
			super.close();
		} catch (IOException e) {
			log.warn("failed to close record reader", e);
		}
	}

	/**
	 * Heuristically decides whether the text is Chinese/English content worth segmenting.
	 *
	 * <p>Text shorter than {@value #MIN_CONTENT_LENGTH} characters is rejected.
	 * Otherwise {@code length / 10} characters are sampled at random (with
	 * replacement), and the text is rejected when more than half of the samples
	 * are neither Chinese nor English. Because of the random sampling, the result
	 * for borderline text may differ between calls.
	 *
	 * @param strName the content to inspect
	 * @return {@code true} if the content should be segmented
	 */
	private boolean canHandle(String strName) {
		int len = strName.length();
		if (len < MIN_CONTENT_LENGTH) {
			return false;
		}
		// len >= MIN_CONTENT_LENGTH guarantees sampleLen >= 3, so no division by zero below.
		int sampleLen = len / 10;
		int errorCount = 0;
		for (int i = 0; i < sampleLen; i++) {
			// nextInt's bound is exclusive; use len (not len - 1) so the last char can be sampled.
			char c = strName.charAt(random.nextInt(len));
			if (!isChinese(c) && !isEnglish(c)) {
				errorCount++;
			}
		}
		return errorCount * 1.0 / sampleLen <= MAX_UNKNOWN_RATIO;
	}

	/**
	 * Tells whether the character belongs to one of the Unicode blocks this class
	 * treats as Chinese text or punctuation.
	 *
	 * <p>NOTE(review): CJK Extension B lies outside the Basic Multilingual Plane,
	 * so a single {@code char} presumably never maps to it; the check is kept for
	 * parity with the original list.
	 *
	 * @param c the character to classify
	 * @return {@code true} if the character's block is in the accepted set
	 */
	private boolean isChinese(char c) {
		Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
		return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
				|| ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
				|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
				|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
				|| ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
				|| ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
				|| ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
	}

	/**
	 * Tells whether the character is an ASCII letter.
	 *
	 * @param c the character to classify
	 * @return {@code true} for {@code a-z} and {@code A-Z}
	 */
	private boolean isEnglish(char c) {
		return (c>='a' && c<='z') || (c>='A' && c<='Z');
	}

	/**
	 * Manual smoke test: iterates over every sentence of a file.
	 *
	 * @param args optional; {@code args[0]} is the input file path, otherwise a
	 *             built-in default path is used
	 * @throws IOException if the file cannot be opened
	 * @throws InterruptedException if initialization is interrupted
	 */
	public static void main(String[] args) throws IOException, InterruptedException {
		String file = (args != null && args.length > 0) ? args[0] : "D:\\textdata\\userData\\c2\\1.txt";
		WechatLineSentenceRecordReader rr = new WechatLineSentenceRecordReader(8);
		rr.initialize(new FileSplit(new File(file)));
		try {
			while(rr.hasNext()){
				rr.nextSentence();
			}
		} finally {
			// Release the underlying file handle even if iteration fails.
			rr.finish();
		}
	}

	/**
	 * This reader does not support sentence pre-processing.
	 *
	 * @return always {@code null}
	 */
	@Override
	public SentencePreProcessor getPreProcessor() {
		return null;
	}

	/**
	 * Ignored: this reader does not support sentence pre-processing.
	 *
	 * @param preProcessor unused
	 */
	@Override
	public void setPreProcessor(SentencePreProcessor preProcessor) {
	}

}
